R/sequence.R

Defines functions guessEncoding phredScore char2ascii

#' Calculate ascii for chars
.char2ascii <- function( qualityChars) {
  strtoi(sapply(qualityChars, charToRaw),16L)
}

#' @param qualityChar A vector of quality chars
#'
#' Calculate Phred quality score from base-calling error probabilites P
#'
#' @param P A vector of probabilites
#' @return Q A vector of Phred Score

#' @examples
#' P <- c(0, .05, .01, .001, .0001)
#' phredScore(P)
phredScore <- function(P) {
  if( any(P > 1 | P < 0) ) {
    stop("Base-calling error probablites 'P' must be within 0 to 1")
  }

  Q <- -10 * log10(P)
  Q <- round(Q, digits = 0)
  Q

}

#' Guess fastq file quality encoding
#'
#' @param fastqFile Single file name of FASTQ files to be read
#' @param lines How many lines are used to guess encoding
#' @return Sanger , Illumina 1.3, Illumina 1.5 or Illumina 1.8
#' @examples
#' fl <- system.file(package="ShortRead", "extdata", "E-MTAB-1147","ERR127302_1_subset.fastq.gz")
#' guessEncoding(fl)
#'
#' @import ShortRead
#' @import Biostrings


guessEncoding <- function(fastqFile, lines = 1000) {
  asciiRange <- list(Sanger = c(33,73),
                     Illumina_1.3 = c(64,104),
                     Illumina_1.5 = c(67,104),
                     Illumina_1.8 = c(33,74))
  rfq <- FastqSampler(fastqFile, lines)
  qstring <- quality(yield(rfq))
  qualityChars <- names(encoding(qstring))
  qualityAscii <- .char2ascii(qualityChars)
  obsRange <- c(min(qualityAscii), max(qualityAscii))
  encodingIdx <- sapply(asciiRange, `[`,1) == obsRange[1] &
    sapply(asciiRange, `[`, 2) == obsRange[2]
  if ( !any(encodingIdx) ) {
    warning("The observed Phred score range is ", obsRange[1], "-",
            obsRange[2], ", Return NA since encoding can not be identified")
    return(NA)
  }

  obsEncoding <- names(asciiRange[encodingIdx])
  if (length(obsEncoding) > 1 ) {
    stop("ERROR, ambigous encoding identified")
  }

  obsEncoding
}
ahy1221/abort documentation built on Aug. 19, 2017, 12:14 a.m.