inst/Scripts/simulate.R

## quality scores needs to be a 3way array,
## you can create one using something like this
## 
# sp <- SolexaPath("/mnt/fred/solexa/ycao/080623_HWI-EAS88_0001")
# rfq <- readFastq(analysisPath(sp), pattern="s_1_sequence.txt")
# abc <- alphabetByCycle(quality(rfq))
## this gives a three way array, rows areq


seqQScores <- function(inScores, abc) {
    ##set up the quality scores
    ##we need to translate to integer from ascii,
    ##then average the integer codes and translate back
    q = rawToChar(as.raw(32:(32+93))) # these are valid quality encodings
    intQ = as(SFastqQuality(q), "numeric") + 33L # actually, 'integer' :(

    ##we start with a 3-way array (letter by quality by position)
    ##and drop to a two-way (letter by position) containing the
    ##average quality for that letter and position
    x1 = apply(abc, c(1,3), function(x)
               if(sum(x) > 0) round(sum(x*intQ)/sum(x)) else 0)

    ##drop all the letters with zero scores
    x1 = x1[rowSums(x1) != 0, ]
    ##fix up the N's in the first 12, as they will have scores of
    ##zero, since Solexa requires the first 12 to be unambiguous
    ## calls
    x1[5, 1:12] = 36

    ##and now, translate back from integer codes to ascii
    ss = strsplit(q, "")[[1]]
    x2 = ss[x1]
    dim(x2) = dim(x1)
    dimnames(x2) = dimnames(x1)
    x2
}

## assume size is a named vector with names correspond to chromosomes
## whose values are the number of reads to simulate from that chromosome,
## genome is the name of a genome, eg Mmusculus, assume BSgenome
## is loaded, 
## readLength is the length of the reads
## qualityScores is a matrix of 5 (ACGTN) by number of cycles
## replace is a logical for sampling with replacement
simulateReads <- function(size, genome, readLength, qualityScores,
                          replace=FALSE, verbose=FALSE) {
    if(ncol(qualityScores) != readLength)
        stop("ncol(qualityScores) != readLength")

    if(all(size < 1))
        size = size * seqlengths(genome)[names(size)]
    simValues =
      lapply(seq_len(length(size)), function(i) {
          if(verbose) cat("Iteration: ", i, " ", sep="")
          chr = names(size)[i]
          seq = unmasked(genome[[chr]])
          if(verbose) cat(".")
          chrlen = seqlengths(genome)[[chr]] - readLength
          starts = sample(chrlen, size[i], replace=replace)
          if(verbose) cat(".")
          simReads =
            as.character(Views(seq, IRanges(start=starts, width=readLength)))
          if(verbose) cat(".done\n")
          return(list(sread = simReads, id = paste(chr, starts, sep = ",")))
      })
    simReads = unlist(lapply(simValues, "[[", "sread"))
    simQ =
      do.call(paste,
              c(lapply(seq_len(readLength), function(i)
                       qualityScores[substring(simReads, i, i), i]),
                sep=""))
    return(ShortReadQ(sread = DNAStringSet(simReads), 
                      quality = SFastqQuality(BStringSet(simQ)),
                      id = BStringSet(unlist(lapply(simValues, "[[", "id")))))
}
Bioconductor/chipseq documentation built on Oct. 29, 2023, 5:04 p.m.