R/makeBins.R
In AneuFinder: Analysis of Copy Number Variation in Single-Cell-Sequencing Data

Documented in fixedWidthBins variableWidthBins

#' Bin the genome
#'
#' Please see functions \code{\link{fixedWidthBins}} and \code{\link{variableWidthBins}} for further details.
#'
#' @name binning
NULL


#' Make fixed-width bins
#'
#' Make fixed-width bins based on given bin size.
#'
#' @param bamfile A BAM file from which the header is read to determine the chromosome lengths. If a \code{bamfile} is specified, option \code{assembly} is ignored.
#' @param assembly An assembly from which the chromosome lengths are determined. Please see \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}} for available assemblies. This option is ignored if \code{bamfile} is specified. Alternatively a data.frame generated by \code{\link[GenomeInfoDb]{getChromInfoFromUCSC}}.
#' @param chrom.lengths A named character vector with chromosome lengths. Names correspond to chromosomes.
#' @param chromosome.format A character specifying the format of the chromosomes if \code{assembly} is specified. Either 'NCBI' for (1,2,3 ...) or 'UCSC' for (chr1,chr2,chr3 ...). If a \code{bamfile} or \code{chrom.lengths} is supplied, the format will be chosen automatically.
#' @param binsizes A vector of bin sizes in base pairs.
#' @param stepsizes A vector of step sizes in base pairs, the same length as \code{binsizes}.
#' @param chromosomes A subset of chromosomes for which the bins are generated.
#' @return A \code{list()} of \code{\link{GRanges-class}} objects with fixed-width bins. If \code{stepsizes} is specified, a \code{list()} of \code{\link{GRangesList}} objects with one entry per step.
#' @author Aaron Taudt
#' @importFrom Rsamtools BamFile
#' @export
#'
#'@examples
#'## Make fixed-width bins of size 500kb and 1Mb
#'bins <- fixedWidthBins(assembly='mm10', chromosome.format='NCBI', binsizes=c(5e5,1e6))
#'bins
#'
fixedWidthBins <- function(bamfile=NULL, assembly=NULL, chrom.lengths=NULL, chromosome.format, binsizes=1e6, stepsizes=NULL, chromosomes=NULL) {

	### Check user input ###
  if (length(binsizes) == 0) {
    return(list())
  }
	if (is.null(bamfile) & is.null(assembly) & is.null(chrom.lengths)) {
		stop("Please specify either a 'bamfile', 'assembly' or 'chrom.lengths'")
	}
	if (is.null(bamfile) & is.null(chrom.lengths)) {
		trigger.error <- chromosome.format
	}
  if (!is.null(stepsizes)) {
    if (length(stepsizes) != length(binsizes)) {
      stop("Need one element in 'stepsizes' for each element in 'binsizes'.")
    }
    if (any(binsizes < stepsizes)) {
        stop("'stepsizes' must be smaller/equal than 'binsizes'")
    }
  }

	### Get chromosome lengths ###
	if (!is.null(bamfile)) {
      ptm <- startTimedMessage(paste0("Reading header from ", bamfile, " ..."))
      chrom.lengths <- GenomeInfoDb::seqlengths(Rsamtools::BamFile(bamfile))
      stopTimedMessage(ptm)
	} else if (!is.null(assembly)) {
		if (is.character(assembly)) {
			ptm <- startTimedMessage("Fetching chromosome lengths from UCSC ...")
			df <- GenomeInfoDb::getChromInfoFromUCSC(assembly)
			stopTimedMessage(ptm)
		} else if (is.data.frame(assembly)) {
			df <- assembly
		} else {
			stop("Unknown assembly")
		}
		chrom.lengths <- df$size
		if (chromosome.format=='UCSC') {
		} else if (chromosome.format=='NCBI') {
                	df$chrom = sub('^chr', '', df$chrom)
		}
		names(chrom.lengths) <- df$chrom
		chrom.lengths <- chrom.lengths[!is.na(names(chrom.lengths))]
		chrom.lengths <- chrom.lengths[!is.na(chrom.lengths)]
	} else if (!is.null(chrom.lengths)) {
		chrom.lengths <- chrom.lengths[!is.na(names(chrom.lengths))]
		chrom.lengths <- chrom.lengths[!is.na(chrom.lengths)]
	}
	chroms.in.data <- names(chrom.lengths)
	if (is.null(chromosomes)) {
		chromosomes <- chroms.in.data
	}
	chroms2use <- intersect(chromosomes, chroms.in.data)
	## Stop if none of the specified chromosomes exist
	if (length(chroms2use)==0) {
		chrstring <- paste0(chromosomes, collapse=', ')
		stop('Could not find length information for any of the specified chromosomes: ', chrstring, '. Pay attention to the naming convention in your data, e.g. "chr1" or "1".')
	}
	## Issue warning for non-existent chromosomes
	diff <- setdiff(chromosomes, chroms.in.data)
	if (length(diff)>0) {
		diffs <- paste0(diff, collapse=', ')
		warning('Could not find length information for the following chromosomes: ', diffs)
	}

	### Making fixed-width bins ###
	bins.list <- list()
	for (ibinsize in 1:length(binsizes)) {
	  binsize <- binsizes[ibinsize]
    ptm <- startTimedMessage("Making fixed-width bins for bin size ", binsize, " ...")
    chrom.lengths.floor <- floor(chrom.lengths / binsize) * binsize
    bins <- unlist(GenomicRanges::tileGenome(chrom.lengths.floor[chroms2use], tilewidth=binsize), use.names=FALSE)
    bins <- bins[end(bins) > 0] # no chromosomes that are smaller than binsize
    if (any(width(bins)!=binsize)) {
      stop("tileGenome failed")
    }
		# seqlengths(bins) <- as.integer(chrom.lengths[names(seqlengths(bins))])
    seqlengths(bins) <- chrom.lengths[chroms2use]
    if (!is.null(stepsizes)) {
      shift.bp <- 0
  	  stepsize <- stepsizes[ibinsize]
  	  bins.list.step <- GRangesList()
  	  while (shift.bp < binsize) {
  	    bins.list.step[[as.character(shift.bp)]] <- suppressWarnings( trim(shift(bins, shift.bp)) )
        shift.bp <- stepsize + shift.bp
  	  }
  	  bins.list[[paste0('binsize_', format(binsize, scientific=TRUE, trim=TRUE), '_stepsize_', format(stepsize, scientific=TRUE, trim=TRUE))]] <- bins.list.step
    } else {
  		bins.list[[paste0('binsize_', format(binsize, scientific=TRUE, trim=TRUE))]] <- bins
    }

    skipped.chroms <- setdiff(seqlevels(bins), as.character(unique(seqnames(bins))))
    if (length(skipped.chroms)>0) {
        warning("The following chromosomes were skipped because they are smaller than binsize ", binsize, ": ", paste0(skipped.chroms, collapse=', '))
    }
    stopTimedMessage(ptm)

	}

	return(bins.list)

}


#' Make variable-width bins
#' 
#' Make variable-width bins based on a reference BAM file. This can be a simulated file (produced by \code{\link{simulateReads}} and aligned with your favourite aligner) or a real reference.
#' 
#' Variable-width bins are produced by first binning the reference BAM file with fixed-width bins and selecting the desired number of reads per bin as the (non-zero) maximum of the histogram. A new set of bins is then generated such that every bin contains the desired number of reads.
#' 
#' @param reads A \code{\link{GRanges-class}} with reads. See \code{\link{bam2GRanges}} and \code{\link{bed2GRanges}}.
#' @param binsizes A vector with binsizes. Resulting bins will be close to the specified binsizes.
#' @param stepsizes A vector of step sizes in base pairs, the same length as \code{binsizes}.
#' @param chromosomes A subset of chromosomes for which the bins are generated.
#' @return A \code{list()} of \code{\link{GRanges-class}} objects with variable-width bins. If \code{stepsizes} is specified, a \code{list()} of \code{\link{GRangesList}} objects with one entry per step.
#' @author Aaron Taudt
#' @importFrom S4Vectors endoapply
#' @export
#'
#'@examples
#'## Get an example BED file with single-cell-sequencing reads
#'bedfile <- system.file("extdata", "KK150311_VI_07.bam.bed.gz", package="AneuFinderData")
#'## Read the file into a GRanges object
#'reads <- bed2GRanges(bedfile, assembly='mm10', chromosomes=c(1:19,'X','Y'),
#'                     min.mapq=10, remove.duplicate.reads=TRUE)
#'## Make variable-width bins of size 500kb and 1Mb
#'bins <- variableWidthBins(reads, binsizes=c(5e5,1e6))
#'## Plot the distribution of binsizes
#'hist(width(bins[['binsize_1e+06']]), breaks=50)
#'
variableWidthBins <- function(reads, binsizes, stepsizes=NULL, chromosomes=NULL) {
	
	### Check user input ###
	chroms.in.data <- seqlevels(reads)
	if (is.null(chromosomes)) {
		chromosomes <- chroms.in.data
	}
	chroms2use <- intersect(chromosomes, chroms.in.data)
	## Stop if non of the specified chromosomes exist
	if (length(chroms2use)==0) {
		chrstring <- paste0(chromosomes, collapse=', ')
		stop('Could not find length information for any of the specified chromosomes: ', chrstring)
	}
	## Issue warning for non-existent chromosomes
	diff <- setdiff(chromosomes, chroms.in.data)
	if (length(diff)>0) {
		diffs <- paste0(diff, collapse=', ')
		warning('Could not find length information for the following chromosomes: ', diffs)
	}
  if (!is.null(stepsizes)) {
    if (length(stepsizes) != length(binsizes)) {
      stop("Need one element in 'stepsizes' for each element in 'binsizes'.")
    }
    if (any(binsizes < stepsizes)) {
        stop("'stepsizes' must be smaller/equal than 'binsizes'")
    }
  }

	## Drop unwanted seqlevels
	reads <- reads[seqnames(reads) %in% chroms2use]
	reads <- keepSeqlevels(reads, chroms2use)

	## Make fixed width bins
	ptm <- startTimedMessage("Binning reads in fixed-width windows ...")
	binned.list <- suppressMessages( binReads(reads, assembly=NULL, binsizes=binsizes, stepsizes=stepsizes, calc.complexity=FALSE, chromosomes=chromosomes) )
	stopTimedMessage(ptm)
	
	## Sort the reads
	strand(reads) <- '*'
	reads <- sort(reads)
	
	## Loop over binsizes
	bins.list <- list()
	for (i1 in 1:length(binsizes)) {
		binsize <- binsizes[i1]
		if (is.null(stepsizes)) {
  		ptm <- startTimedMessage("Making variable-width windows for bin size ", binsize, " ...")
		} else {
  		ptm <- startTimedMessage("Making variable-width windows for bin size ", binsize, " and step size ", stepsizes[i1], " ...")
		}
		if (is(binned.list[[i1]], "GRangesList")) {
  		binned <- binned.list[[i1]][[1]]
		} else if (is(binned.list[[i1]], "GRanges")) {
  		binned <- binned.list[[i1]]
		}
		## Get median of histogram
		mediancount <- as.integer(median(binned$counts[binned$counts>0]))
		mediancount.perstep <- 0
		if (!is.null(stepsizes)) {
		  stepsize <- stepsizes[i1]
		  numsteps <- as.integer(binsize / stepsize)
  		mediancount.perstep <- unique(as.integer((0:(numsteps-1)) * mediancount / numsteps))
		}
		bins.list.step <- GRangesList()
		for (istep in 1:length(mediancount.perstep)) {
  		## Pick only every mediancount read
  		subreads <- GRangesList()
  		skipped.chroms <- character()
  		for (chrom in chroms2use) {
  			reads.chr <- reads[seqnames(reads)==chrom]
  			if (length(reads.chr) >= mediancount) {
  			  if (mediancount.perstep[istep] == 0) {
    				idx <- seq(mediancount, length(reads.chr), by=mediancount)
  			  } else {
    				idx <- seq(mediancount.perstep[istep], length(reads.chr), by=mediancount)
  			  }
  				subreads[[chrom]] <- reads.chr[idx]
  			} else {
  				skipped.chroms[chrom] <- chrom
  			}
  		}
  		subreads <- unlist(subreads, use.names=FALSE)
  		## Adjust length of reads to get consecutive bins
  		subreads <- resize(subreads, width=1)
  		## Make new bins
  		bins <- gaps(subreads, start=1L, end=seqlengths(subreads)-1L) # gaps until seqlengths-1 because we have to add 1 later to get consecutive bins
  		bins <- bins[strand(bins)=='*']
  		end(bins) <- end(bins) + 1
  		## Remove first bin
  		if (mediancount.perstep[istep] > 0) {
  		  bins <- bins[-1]
  		}
  		## We don't want incomplete bins at the end
  		bins.split <- split(bins, seqnames(bins))
  		bins.split <- endoapply(bins.split, function(x) { x[-length(x)] })
  		bins <- unlist(bins.split, use.names=FALSE)
  		## Remove skipped chromosomes
  		bins <- bins[!seqnames(bins) %in% skipped.chroms]
  		bins <- keepSeqlevels(bins, setdiff(seqlevels(bins), skipped.chroms))
  		bins.list.step[[as.character(istep)]] <- bins
		}
		if (length(skipped.chroms)>0) {
			warning("The following chromosomes were skipped because they are smaller than binsize ", binsize, ": ", paste0(skipped.chroms, collapse=', '))
		}

		if (is.null(stepsizes)) {
  		bins.list[[paste0('binsize_', format(binsize, scientific=TRUE, trim=TRUE))]] <- bins.list.step[[1]]
		} else {
  		bins.list[[paste0('binsize_', format(binsize, scientific=TRUE, trim=TRUE), '_stepsize_', format(stepsize, scientific=TRUE, trim=TRUE))]] <- bins.list.step
		}
		stopTimedMessage(ptm)
	}
	
	return(bins.list)

}