R/extract.R

##' Extract FastQC
##'
##' Return the number of sequences per entry in a set of zip archives generated by FastQC.
##' To be used after readFastqcZips().
##' @param all.qc return value from readFastqcZips()
##' @return numeric vector
##' @author Timothee Flutre [cre,aut]
##' @export
extractFastqcNreads <- function(all.qc){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))
  sapply(all.qc, function(qc){
    as.numeric(qc[["Basic_Statistics"]]$Value[qc[["Basic_Statistics"]]$Measure
                                              == "Total Sequences"])
  })
}

##' Extract FastQC
##'
##' Return the number, or percentage, of sequences per quality score per entry in a set of zip archives generated by FastQC.
##' To be used after readFastqcZips().
##' @param all.qc return value from readFastqcZips()
##' @param perc return percentage of sequences if TRUE, number of sequences otherwise
##' @param nreads return value from nreads.fastqc(), required if perc=TRUE
##' @return numeric matrix with entries in rows and number (percentage) of sequences per quality in columns
##' @author Timothee Flutre [cre,aut]
##' @export
extractFastqcQuals <- function(all.qc, perc=FALSE, nreads=NULL){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)),
            ifelse(perc, ! is.null(nreads), TRUE))
  N <- length(all.qc)
  qual <- matrix(NA, nrow=N, ncol=50,
                 dimnames=list(names(all.qc), paste0("Q=", 1:50)))
  for(i in 1:N)
    qual[i, all.qc[[i]][["Per_sequence_quality_scores"]][,"Quality"]] <-
      all.qc[[i]][["Per_sequence_quality_scores"]][,"Count"]
  if(perc)
    for(i in 1:nrow(qual))
      qual[i,] <- (qual[i,] / nreads[i]) * 100
  return(qual)
}

##' Extract FastQC
##'
##' Returns the adapter content along the sequences per entry in a set of zip archives generated by FastQC.
##' To be used after readFastqcZips().
##' When reads are long, FastQC can average adapter content over a given range, e.g. over two successive bases, such as "11-12".
##' In such a case, because the ranges can be different for different fastq files in \code{all.qc}, this function will set adapter content to the first base of the range in the ouput, here "11", the other, here "12", will get NA.
##' @param all.qc return value from readFastqcZips()
##' @param adp name of the adapter to plot (default="Illumina Universal Adapter")
##' @return numeric matrix with entries in rows and positions along sequences in columns
##' @author Timothee Flutre [cre,aut], Nicolas Rode [ctb]
##' @export
extractFastqcAdpContents <- function(all.qc, adp="Illumina Universal Adapter"){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))

  ## set up the structure of the output
  N <- length(all.qc)
  L <- -1
  for(i in 1:N){
    nb.pos <- length(all.qc[[i]]$Adapter_Content[["Position"]])
    L.i <- as.numeric(
        strsplit(
            as.character(all.qc[[i]]$Adapter_Content[["Position"]][nb.pos]),
            "-")[[1]][1])
    L <- ifelse(L.i > L, L.i, L)
  }
  adp.content <- matrix(data=NA, nrow=N, ncol=L,
                        dimnames=list(names(all.qc), as.character(1:L)))

  ## fill the output
  for(i in 1:N){
    stopifnot(adp %in% names(all.qc[[i]]$Adapter_Content))
    first.positions <- sapply(
        strsplit(as.character(all.qc[[i]]$Adapter_Content[["Position"]]),
                 "-"), function(x){x[[1]][1]})
    adp.content[i, first.positions] <- all.qc[[i]]$Adapter_Content[[adp]]
  }

  return(adp.content)
}

##' Extract FastQC
##'
##' Returns the N count along the sequences per entry in a set of zip archives generated by FastQC.
##' To be used after readFastqcZips().
##' When reads are long, FastQC can average N count over a given range, e.g. over two successive bases, such as "11-12".
##' In such a case, because the ranges can be different for different fastq files in \code{all.qc}, this function will set N count to the first base of the range in the ouput, here "11", the other, here "12", will get NA.
##' @param all.qc return value from readFastqcZips()
##' @return numeric matrix with entries in rows and positions along sequences in columns
##' @author Timothee Flutre [cre,aut]
##' @export
extractFastqcBaseNs <- function(all.qc){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))

  ## set up the structure of the output
  N <- length(all.qc)
  L <- -1
  for(i in 1:N){
    nb.pos <- length(all.qc[[i]]$Per_base_N_content[,"Base"])
    L.i <- as.numeric(
        strsplit(
            as.character(all.qc[[i]]$Per_base_N_content[,"Base"][nb.pos]),
            "-")[[1]][1])
    L <- ifelse(L.i > L, L.i, L)
  }
  baseN <- matrix(data=NA, nrow=N, ncol=L,
                  dimnames=list(names(all.qc), as.character(1:L)))

  ## fill the output
  for(i in 1:N){
    first.positions <- as.numeric(sapply(
        strsplit(as.character(all.qc[[i]]$Per_base_N_content[,"Base"]), "-"),
        function(x){x[[1]][1]}))
    baseN[i, first.positions] <- as.numeric(all.qc[[i]]$Per_base_N_content[,"N-Count"])
  }

  return(baseN)
}

##' Extract FastQC
##'
##' Returns the sequence length distribution per entry in a set of zip archives generated by FastQC.
##' To be used after readFastqcZips().
##' @param all.qc return value from readFastqcZips()
##' @return numeric matrix with entries in rows and positions along sequences in columns
##' @author Timothee Flutre [cre,aut], Nicolas Rode [ctb]
##' @export
extractFastqcSeqLengths <- function(all.qc){
  stopifnot(is.list(all.qc), ! is.null(names(all.qc)))
  N <- length(all.qc)
  positions <- lapply(1:N, function(i){
    as.character(all.qc[[i]]$Sequence_Length_Distribution[["Length"]])
  })
  positions <- unique(sort(sapply(
      strsplit(do.call(c, positions), "-"), function(x){
        as.numeric(x[1])
      })))
  seq.lengths <- matrix(0, nrow=N, ncol=length(positions),
                        dimnames=list(names(all.qc), positions))
  for(i in 1:N){
    tmp <- all.qc[[i]]$Sequence_Length_Distribution
    if (length(grep("-",tmp$Length)) != 0)
      tmp$Length <- sapply(strsplit(tmp$Length, "-"), function(x){x[1]})
    seq.lengths[i,as.character(tmp$Length)] <- tmp$Count
  }
  return(seq.lengths)
}
timflutre/rutilsfastqc documentation built on May 31, 2019, 2:16 p.m.