R/comppoiss_wrapper.R
In motifcounter: R package for analysing TFBSs in DNA sequences

Documented in clumpSizeDist compoundPoissonDist

#' Compound Poisson Approximation
#'
#' This function approximates the distribution of the number of motif hits
#' that emerges from a random DNA sequence of a given length.
#' 
#' The distribution can be determined in two alternative ways:
#' \enumerate{
#' \item A re-implemented version of the algorithm that was
#' described in Pape et al. \emph{Compound poisson approximation
#' of the number of occurrences of a position
#' frequency matrix (PFM) on both strands.} 2008
#' can be invoked using method='pape'.
#' The main purpose of this implementation concerns 
#' benchmarking an improved approximation.
#' In contrast to the original model, this implementation 
#' can be used with general order-d Markov models.
#' \item We provide an improved compound Poisson approximation that
#' uses more appropriate statistical assumptions concerning
#' overlapping motif hits and that can be used with order-d
#' background models as well. The improved version is used by default
#' with method='kopp'.
#' Note: Only method='kopp' supports the computation
#' of the distribution of the number of motif hits w.r.t. scanning
#' a single DNA strand (see \code{\link{probOverlapHit}}).
#' }
#'
#'
#' @param seqlen Integer-valued vector that defines the lengths of the
#' individual sequences. For a given DNAStringSet, 
#' this information can be retrieved using \code{\link{numMotifHits}}.
#' @param overlap An Overlap object.
#' @param method String that defines which method shall be invoked: 'pape' or
#' 'kopp' (see description). Default: method = 'kopp'.
#'
#' @return List containing
#' \describe{
#' \item{dist}{Distribution of the number of hits}
#' }
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#' 
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Load background model
#' bg = readBackground(seqs, 1)
#'
#' # Use 100 individual sequences of length 150 bp each
#' seqlen = rep(150, 100)
#'
#' # Compute overlapping probabilities
#' # for scanning the forward DNA strand only
#' op = motifcounter:::probOverlapHit(motif, bg, singlestranded = TRUE)
#'
#' # Computes  the compound Poisson distribution
#' dist = motifcounter:::compoundPoissonDist(seqlen, op)
#' #plot(1:length(dist$dist)-1, dist$dist)
#'
#' # Compute overlapping probabilities
#' # for scanning the forward DNA strand only
#' op = motifcounter:::probOverlapHit(motif, bg, singlestranded = FALSE)
#' 
#' # Computes  the compound Poisson distribution
#' dist = motifcounter:::compoundPoissonDist(seqlen, op)
#' #plot(1:length(dist$dist)-1, dist$dist)
#'
#' @seealso \code{\link{combinatorialDist}}
#' @seealso \code{\link{probOverlapHit}}
#' @seealso \code{\link{numMotifHits}}
compoundPoissonDist = function(seqlen, overlap, method = "kopp") {
    stopifnot(is(overlap, "Overlap"))
    
    # Length must be at least as long as the motif
    sl = sum(vapply(seqlen, function(sl, ml) {
        sl - ml + 1
    }, FUN.VALUE = 0,
    ml = length(getBeta(overlap))))

    if (sl <= 0) {
        return (list(dist = 1))
    }
    
    # for all practical purposes, a maximal clump size of 60
    # should be enough
    maxclumpsize = 60
    
    # determine the max number of hits automatically from the
    # given sequence length.
    # even though it is a little bit of computational overhead,
    # set the maxhits to the total sequence length
    maxhits = sum(seqlen)
    dist = numeric(maxhits + 1)
    if (method == "kopp") {
        res = .C(
            motifcounter_compoundPoisson_useBeta,
            getAlpha(overlap),
            getBeta(overlap),
            getBeta3p(overlap),
            getBeta5p(overlap),
            as.numeric(dist),
            as.integer(length(seqlen)),
            as.integer(seqlen),
            as.integer(maxhits),
            as.integer(maxclumpsize),
            length(getBeta(overlap)),
            as.integer(getSinglestranded(overlap))
        )
        dist = res[[5]]
    } else if (method == "pape") {
        if (getSinglestranded(overlap) == TRUE) {
            stop(
                paste(strwrap(
                "method = 'pape' does not 
                support singlestranded = TRUE (see probOverlapHit).
                Please use method = 'kopp' instead."), collapse = "\n")
            )
        }
        res = .C(
            motifcounter_compoundPoissonPape_useGamma,
            getGamma(overlap),
            as.numeric(dist),
            as.integer(length(seqlen)),
            as.integer(seqlen),
            as.integer(maxhits),
            as.integer(maxclumpsize),
            length(getBeta(overlap))
        )
        dist = res[[2]]
        } else {
            stop(
                "Invalid 'method': The 'method' must be 'kopp' or 'pape'")
        }
    return (list(dist = dist))
}

#' Clump size distribution
#'
#' This function approximates the distribution of the clump sizes.
#' 
#' The clump size distribution can be determined in two alternative ways:
#' \enumerate{
#' \item A re-implemented version of the algorithm that was
#' described in Pape et al. \emph{Compound poisson approximation
#' of the number of occurrences of a position
#' frequency matrix (PFM) on both strands.} 2008
#' can be invoked using method='pape'.
#' \item An improved approximation of the clump size distribution 
#' uses more appropriate statistical assumptions concerning
#' overlapping motif hits and that can be used with order-d
#' background models as well. The improved version is used by default
#' with method='kopp'.
#' }
#'
#'
#' @inheritParams compoundPoissonDist
#' @param maxclump Maximal clump size
#'
#' @return List containing
#' \describe{
#' \item{dist}{Distribution of the clump size}
#' }
#' @examples
#'
#' # Load sequences
#' seqfile = system.file("extdata", "seq.fasta", package = "motifcounter")
#' seqs = Biostrings::readDNAStringSet(seqfile)
#' 
#' # Load motif
#' motiffile = system.file("extdata", "x31.tab", package = "motifcounter")
#' motif = t(as.matrix(read.table(motiffile)))
#'
#' # Load background model
#' bg = readBackground(seqs, 1)
#'
#' # Use 100 individual sequences of length 150 bp each
#' seqlen = rep(150, 100)
#'
#'
#' # Compute overlapping probabilities
#' # for scanning the forward DNA strand only
#' op = motifcounter:::probOverlapHit(motif, bg, singlestranded = FALSE)
#' 
#' # Computes  the compound Poisson distribution
#' dist = motifcounter:::clumpSizeDist(20, op)
#'
#' @seealso \code{\link{probOverlapHit}}
clumpSizeDist = function(maxclump, overlap, method = "kopp") {
    stopifnot(is(overlap, "Overlap"))
    stopifnot(getSinglestranded(overlap) == FALSE)
    
    dist = numeric(maxclump)
    if (method == "kopp") {
        res = .C(
            motifcounter_clumpsize_kopp,
            getBeta(overlap),
            getBeta3p(overlap),
            getBeta5p(overlap),
            as.numeric(dist),
            as.integer(maxclump),
            length(getBeta(overlap))
        )
        dist = res[[4]]
    } else if (method == "pape") {
        res = .C(
            motifcounter_clumpsize_pape,
            getGamma(overlap),
            as.numeric(dist),
            as.integer(maxclump),
            length(getBeta(overlap))
        )
        dist = res[[2]]
    } else {
        stop(
            "Invalid 'method': The 'method' must be 'kopp' or 'pape'")
    }
    return (list(dist = dist))
}