R/kmer_distribution.R

Defines functions get_kmer_distribution get_reads_num generate_paired_reads

Documented in generate_paired_reads get_kmer_distribution get_reads_num

##' Calculates the k-mer distribution of a given sequence
##'
##' Calculates the k-mer distribution of a given sequence
##' @title Get the k-mer distribution from a sequence 
##' @param seq Sequence of characters 
##' @param kmers vector of wanted k-mers
##' @return data.frame with probability of the k-mers
##' @author Jochen Kruppa
##' @export
get_kmer_distribution <- function(seq, kmers = c(1,2,3,4,5)){
  p_load(plyr, Biostrings, ShortRead)
  kmerDistList <- llply(kmers, function(kmer){
    countDf <- as.data.frame(oligonucleotideFrequency(DNAStringSet(seq), kmer))
    denom <- apply(countDf, 1, sum)
    percentageDf <- countDf/ifelse(denom == 0, 1, denom)
    return(percentageDf)
  }, .parallel = FALSE)
  kmerDistDf <- do.call(cbind, kmerDistList)
  return(kmerDistDf)
}

##' The function calculates the needed reads given a coverage and the
##' genome size
##'
##' The calulation is done by the formula of Lander and Waterman (1988).
##' @title Calculate number of reads given the coverage and genome
##'     size
##' @param coverage Wanted coverage
##' @param readLength Given read length
##' @param genomeSize Size of the genome in bases
##' @return numeric, number of reads
##' @author Jochen Kruppa
##' @references Lander, E. S. and Waterman, M. S. (1988). Genomic
##'     mapping by fingerprinting random clones: A mathematical
##'     analysis. Genomics, 2(3), 231-239.
##' @export
get_reads_num <- function(coverage, readLength, genomeSize) {
    floor(coverage * genomeSize / readLength)
}

##' The function generates two files with paired reads 
##'
##' The generated files (R1 and R2) are generated by drawing the
##' wanted reads from a given initial genome sequence
##' @title Generate paired reads from a given genome
##' @param numReads Number of wanted reads
##' @param initSequence Original genome sequence to draw reads from
##' @param readLength Wanted read length in bases
##' @param readMeanDist Mean distance between the paired reads
##'     (default = 50)
##' @param files Vector of two named files ("R1", "R2") to save the
##'     generated reads
##' @param parallel_flag Should everything done in parallel (not
##'     recommended).
##' @return NULL
##' @author Jochen Kruppa
##' @export
generate_paired_reads <- function(numReads = 10, initSequence, readLength = 150,
                                 readMeanDist = 50, files, parallel_flag = FALSE) {
  p_load(plyr, Biostrings, ShortRead)
  ## get the reads
    l_ply(1:numReads, function(...) {
        ## distance of the paired reads
        readDist <- rpois(1, readMeanDist)
        ## read total range
        readRange <- 2*readLength + readDist
        maxValue <- width(initSequence) - readRange
        ## get the read start positions
        startPosR1 <- sample(1:maxValue, 1)
        startPosR2 <- startPosR1 + readLength + readDist
        ## R1 is easy...
        R1 <- subseq(initSequence, startPosR1, startPosR1 + readLength - 1)
        names(R1) <- "R1"
        ## R2 with the reverse sequence and the complement bases
        R2pre <- subseq(initSequence, startPosR2, startPosR2 + readLength - 1)
        R2 <- complement(Biostrings::reverse(R2pre)) ## complement of the sequence
        names(R2) <- "R2"
        ## write out both reads to separate files
        writeFasta(R1, files[["R1"]], mode = "a")
        writeFasta(R2, files[["R2"]], mode = "a")
    }, .parallel = parallel_flag)
}
jkruppa/acgtPyramid documentation built on May 19, 2019, 12:45 p.m.