seqateurs: Tools for trimming and processing sequence data

Documented in align_overlap n_overlap plot_lengths plot_maxEE plot_quality

# Get Qual Stats ----------------------------------------------------------

#' Get qual stats
#'
#' @param input
#' @param n The number of reads to sample
#'
#' @return
#' @export
#'
#' @examples
get_qual_stats <- function (input, n = 5e+05) {

  statdf <- data.frame(Cycle = integer(0), Mean = numeric(0),
                       Q25 = numeric(0), Q50 = numeric(0), Q75 = numeric(0),
                       Cum = numeric(0), file = character(0))
  anndf <- data.frame(minScore = numeric(0), label = character(0),
                      rclabel = character(0), rc = numeric(0), file = character(0))
  #Check files arent empty
  fl <- input[file.size(input) > 28]
  if (length(input) > length(fl)) {
    message("Warning, the following files were empty: \n")
    message(paste0(input[!input %in% fl], " \n"))
  }
  # Start loop
  FIRST <- TRUE
  for (f in fl[!is.na(fl)]) {
    fq <- FastqFile(f)
    rc <- countFastq(f)$records
    close(fq)
    coll <- QACollate(QAFastqSource(f, n=n), QAQualityByCycle())
    srqa <- qa2(coll, BPPARAM=SerialParam(), verbose=FALSE)
    df <- data.frame(srqa$QAQualityByCycle@values)
    if (rc >= n) {
      rclabel <- paste("Reads >= ", n)
    }  else {      rclabel <- paste("Reads: ", rc)
    }
    means <- rowsum(df$Score * df$Count, df$Cycle)/rowsum(df$Count,  df$Cycle)

    get_quant <- function(xx, yy, q) {
      xx[which(cumsum(yy)/sum(yy) >= q)][[1]]
    }
    q25s <- by(df, df$Cycle, function(x) get_quant(x$Score, x$Count, 0.25), simplify = TRUE)
    q50s <- by(df, df$Cycle, function(x) get_quant(x$Score, x$Count, 0.5), simplify = TRUE)
    q75s <- by(df, df$Cycle, function(x) get_quant(x$Score, x$Count, 0.75), simplify = TRUE)
    cums <- by(df, df$Cycle, function(x) sum(x$Count), simplify = TRUE)
    if (!all(sapply(list(names(q25s), names(q50s), names(q75s), names(cums)), identical, rownames(means)))) {
      stop("Calculated quantiles/means weren't compatible.")
    }
    if (FIRST) {
      plotdf <- cbind(df, file = basename(f))
      FIRST <- FALSE
    } else {
      plotdf <- rbind(plotdf, cbind(df, file = basename(f)))
    }

    statdf <- rbind(statdf, data.frame(Cycle = as.integer(rownames(means)),
                                       Mean = means, Q25 = as.vector(q25s), Q50 = as.vector(q50s),
                                       Q75 = as.vector(q75s), Cum = 10 * as.vector(cums)/rc,
                                       file = basename(f)))
    anndf <- rbind(anndf, data.frame(minScore = min(df$Score),
                                     label = basename(f), rclabel = rclabel, rc = rc,
                                     file = basename(f)))
  }

  anndf$minScore <- min(anndf$minScore)

  get_ee <- function(Q){
    ee <- 10^(-Q/ 10)
    return(ee)
  }


  plotdf.summary <- aggregate(Count ~ Cycle + Score, plotdf, sum)
  plotdf.summary$label <- paste(nrow(anndf), "files (aggregated)")
  means <- rowsum(plotdf.summary$Score * plotdf.summary$Count, plotdf.summary$Cycle)/rowsum(plotdf.summary$Count, plotdf.summary$Cycle)
  EEmeans <- get_ee(rowsum(plotdf.summary$Score * plotdf.summary$Count, plotdf.summary$Cycle)/rowsum(plotdf.summary$Count, plotdf.summary$Cycle))
  q10s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_quant(x$Score, x$Count, 0.1), simplify = TRUE)
  EE10s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_ee(get_quant(x$Score, x$Count, 0.1)), simplify = TRUE)
  q25s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_quant(x$Score, x$Count, 0.25), simplify = TRUE)
  EE25s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_ee(get_quant(x$Score, x$Count, 0.25)), simplify = TRUE)
  q50s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_quant(x$Score, x$Count, 0.5), simplify = TRUE)
  EE50s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_ee(get_quant(x$Score, x$Count, 0.50)), simplify = TRUE)
  q75s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_quant(x$Score, x$Count, 0.75), simplify = TRUE)
  EE75s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_ee(get_quant(x$Score, x$Count, 0.75)), simplify = TRUE)
  q90s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_quant(x$Score, x$Count, 0.9), simplify = TRUE)
  EE90s <- by(plotdf.summary, plotdf.summary$Cycle, function(x) get_ee(get_quant(x$Score, x$Count, 0.9)), simplify = TRUE)
  cums <- by(plotdf.summary, plotdf.summary$Cycle, function(x) sum(x$Count), simplify = TRUE)
  statdf.summary <- data.frame(Cycle = as.integer(rownames(means)),
                               QMean = means,
                               EEmean = EEmeans,
                               Q10 = as.vector(q10s),
                               EE10 = as.vector(EE10s),
                               Q25 = as.vector(q25s),
                               EE25 = as.vector(EE25s),
                               Q50 = as.vector(q50s),
                               EE50 = as.vector(EE50s),
                               Q75 = as.vector(q75s),
                               EE75 = as.vector(EE75s),
                               Q90 = as.vector(q90s),
                               EE90 = as.vector(EE90s),
                               reads = 10 * as.vector(cums)/rc)
  attr(statdf.summary, "qsummary") <- "qsummary"
  return(statdf.summary)
}


# Qual plot ---------------------------------------------------------------

#' Quality plot
#'
#' @param input
#' @param n The number of reads to sample
#'
#' @return
#' @export
#' @import ggplot2
#' @import dplyr
#' @import tidyr
#'
#' @examples
plot_quality <- function(input, n = 5e+05) {
  summary <- get_qual_stats(input, n = n )
  plot <- summary %>%
    dplyr::select(Cycle, reads, starts_with("Q")) %>%
    tidyr::pivot_longer(cols = starts_with("Q")) %>%
    ggplot(aes(x=Cycle, y=value, colour=name)) +
    #geom_point(size=1)  +
    geom_line(data = summary, aes(y = QMean), color = "#66C2A5") +
    geom_line(data = summary, aes(y = Q25), color = "#FC8D62", size = 0.25, linetype = "dashed") +
    geom_line(data = summary, aes(y = Q50), color = "#FC8D62", size = 0.25) +
    geom_line(data = summary, aes(y = Q75), color = "#FC8D62",  size = 0.25, linetype = "dashed") +
    #geom_tile(aes(fill = reads))
    labs(x = "Reads position", y = "Quality Score") +
    ylim(c(0, NA))
  return(plot)
}


#' MaxEE plot
#'
#' @param input
#' @param n
#'
#' @return
#' @export
#' @import ggplot2
#' @import dplyr
#' @import tidyr
#'
#' @examples
plot_maxEE <- function(input, n = 5e+05) {
  summary <- get_qual_stats(input, n = n )
  plot <- summary %>%
    dplyr::select(Cycle, starts_with("EE")) %>%
    tidyr::pivot_longer(cols = starts_with("EE")) %>%
    dplyr::group_by(name) %>%
    dplyr::mutate(cumsumEE = cumsum(value)) %>%
    ggplot(aes(x=Cycle, y=log10(cumsumEE), colour=name)) +
    geom_point(size=1) +
    geom_hline(yintercept = log10(1), color = "red") +
    geom_hline(yintercept = log10(2), color = "red") +
    geom_hline(yintercept = log10(3), color = "red") +
    geom_hline(yintercept = log10(5), color = "red") +
    geom_hline(yintercept = log10(7), color = "red") +
    geom_text(label = "MaxEE=1", aes(x = 0, y = log10(1), hjust = 0, vjust = 0), color = "red") +
    geom_text(label = "MaxEE=2", aes(x = 0, y = log10(2), hjust = 0, vjust = 0), color = "red") +
    geom_text(label = "MaxEE=3", aes(x = 0, y = log10(3), hjust = 0, vjust = 0), color = "red") +
    geom_text(label = "MaxEE=5", aes(x = 0, y = log10(5), hjust = 0, vjust = 0), color = "red") +
    geom_text(label = "MaxEE=7", aes(x = 0, y = log10(7), hjust = 0, vjust = 0), color = "red") +
    labs(x = "Reads position", y = "Log10 Cumulative expected errors")
  return(plot)
}


#' Plot readlengths of fastq
#'
#' @param dir A directory of fastq files
#' @param aggregate Whether plot should be faceted by sample or aggregated together
#' @param sample How many reads to sample from each fastq, default 1e6
#'
#' @return
#' @export
#' @import ShortRead
#' @import magrittr
#' @import dplyr
#' @import tibble
#' @import tidyr
#' @import ggplot2
#'
#' @examples
plot_lengths <- function(dir, aggregate = FALSE, sample = 1e6){
  Forward <-  sort(list.files(dir, pattern="_R1_", full.names = TRUE))
  Reverse <-  sort(list.files(dir, pattern="_R2_", full.names = TRUE))

  lengths <- vector("list", length=length(Forward))
  for(i in 1:length(Forward)){
    #Forwrard reads
    sampler <- ShortRead::FastqSampler(Forward[[i]], n=sample )
    Ffq <- as.data.frame(table(width(ShortRead::yield(sampler))), stringsAsFactors=FALSE) %>%
      magrittr::set_colnames(c("Readlength", "Forward")) %>%
      dplyr::mutate_if(is.character, as.integer)
    #Reverse reads
    sampler <- ShortRead::FastqSampler(Reverse[[i]], n=sample )
    Rfq <- as.data.frame(table(width(ShortRead::yield(sampler))), stringsAsFactors=FALSE) %>%
      magrittr::set_colnames(c("Readlength", "Reverse"))%>%
      dplyr::mutate_if(is.character, as.integer)

    lengths[[i]] <- tibble::tibble(Readlength=seq(1, max(max(Ffq$Readlength), max(Rfq$Readlength)),1)) %>%
      dplyr::left_join(Ffq, by="Readlength") %>%
      dplyr::left_join(Rfq, by="Readlength")
    lengths[[i]]$source <- Forward[i]
  }

  p <- do.call(rbind, lengths) %>%
    dplyr::group_by(Readlength, source) %>%
    dplyr::summarise(Forward = sum(Forward), Reverse = sum(Reverse)) %>%
    tidyr::pivot_longer(cols=3:4,
                 names_to = "Direction",
                 values_to = "Count") %>%
    ggplot(aes(x=Readlength, y=Count, group=Direction, fill=Direction)) +
    geom_bar(stat="identity", colour="black")

  if(aggregate==TRUE){
    p <- p  + facet_grid(~Direction)
  } else {
    p <- p  + facet_grid(source~Direction)
  }
  return(p)
}



# estimate_trunclen ------------------------------------------------------------

#' Automatically estimate trunclen for filtering
#'
#' @param fwd A path to a fastq file containing forward reads
#' @param rev A path to a fastq file containing reverse reads
#' @param threshold A Quality threshold used to signal the threshold to truncate the reads at
#' @param maxlength The maximum length for the reads to be trimmed at. Suggested length of amplicon if known.
#' @param minlength The minimum length for reads to be trimmed at to allow merging.
#' @param qa_sample The number of reads to sample for calculating qualities.
#' @param overlap_sample The number of reads to sample for aligning and estimating minlength
#' @param quiet Whether progress should be printed to console
#'
#' @return
#' @export
#'
#' @examples
estimate_trunclen <- function (fwd, rev = NULL, threshold = 25, maxlength, minlength, minoverlap=20, qa_sample = 5e+05, overlap_sample = 100, quiet = FALSE) {
  fqa <- get_qual_stats(fwd, n = qa_sample)
  if (!is.null(rev)) {
    rqa <- get_qual_stats(rev, n = qa_sample)
  }
  if (missing(maxlength)) {
    maxlength <- max(max(fqa$Cycle), max(rqa$Cycle))
    message(paste0("Maxlength is emply, setting to ",
                   maxlength, " bp"))
  }
  if (missing(minlength) & !is.null(rev)) {
    overlap <- unlist(mapply(n_overlap, fwd, rev, sample = overlap_sample))
    ux <- unique(overlap)
    tab <- tabulate(match(overlap, ux))
    overlap_mode <- ux[tab == max(tab)]
    readlength <- min(max(fqa$Cycle), max(rqa$Cycle))

    # Min-trunclength
    minlength <- readlength - (overlap_mode/2 - minoverlap)
    message(paste0("Minlength is empty, estimated minimum trunlen of ", minlength, "bp to maintain ", minoverlap, "bp overlap between reads"))
  }  else if (missing(minlength) & is.null(rev)) {
    minlength <- 0
    warning(paste0("Minlength is empty and has been set to ",
                   minlength, " bp"))
  }
  fwd_trunc <- suppressWarnings(fqa %>% filter(QMean < threshold) %>% pull(Cycle) %>%
                                  min())
  rev_trunc <- suppressWarnings(rqa %>% filter(QMean < threshold) %>% pull(Cycle) %>%
                                  min())
  if (fwd_trunc > maxlength) {
    fwd_trunc <- maxlength
  }
  if (rev_trunc > maxlength) {
    rev_trunc <- maxlength
  }
  if (fwd_trunc < minlength) {
    fwd_trunc <- minlength
  }
  if (rev_trunc < minlength) {
    rev_trunc <- minlength
  }
  out <- c(fwd_trunc, rev_trunc)
  return(out)
}


#' Minimum overlap between a query and reference
#'
#' @param query
#' @param ref
#'
#' @return
#' @export
#' @importFrom dada2 nwalign
#'
#' @examples
align_overlap <- function(query, ref) {
  al <- dada2::nwalign(query, ref)
  fwd_gaps <- length(gregexpr("-", al[[1]])[[1]])
  rev_gaps <- length(gregexpr("-", al[[2]])[[1]])

  overlap <- nchar(al[[1]]) - (fwd_gaps + rev_gaps)
  return(overlap)
}


#' Number of bases overlapping between forward and reverse reads
#'
#' @param fwd A path to a fastq file containing forward reads
#' @param rev A path to a fastq file containing reverse reads
#' @param sample The number of reads to sample from the fastq files
#'
#' @return
#' @export
#' @import ShortRead
#' @import Biostrings
#'
#' @examples
n_overlap <- function(fwd, rev, sample = 100) {

  #Sample reads
  fF <- ShortRead::FastqStreamer(fwd, n=sample)
  on.exit(close(fF))
  set.seed(123L);fqF <- ShortRead::yield(fF)

  fR <- ShortRead::FastqStreamer(rev, n=sample)
  on.exit(close(fR), add=TRUE)
  set.seed(123L);fqR <- ShortRead::yield(fR)

  #Get reads where both contain no N's
  keep <- (Biostrings::alphabetFrequency(sread(fqF), baseOnly=TRUE)[,"other"] <= 0
           ) & (Biostrings::alphabetFrequency(sread(fqR), baseOnly=TRUE)[,"other"] <= 0)

  fqF <- fqF[keep]
  fqR <- fqR[keep]

  #Check ids match
  idF <- ShortRead::trimTails(ShortRead::id(fqF), 1, " ")
  idR <- ShortRead::trimTails(ShortRead::id(fqR), 1, " ")

  if(!all(idF == idR)){stop("Error: paired end reads dont match")}

  #Get overlap
  overlap <- mapply(align_overlap, as.character(sread(fqF)),  rc(as.character(sread(fqR))))
  overlap <- as.numeric(overlap)
  return(overlap)
}