#' @title Summarising statistics of paired-end read sets
#' @description This function summarises statistics from seqkit for paired-end read sets. To use this function, the input TSV file must
#' be generated using command 'seqkit stats --all'.
#' @param tsv Tab-delimited output (a TSV file) from command 'seqkit stats'
#' @param header Column names matching seqkit stats's output
#' @param ref_len Length of the reference genome in base pairs
#' @param ext Filename extension of FASTQ files in the input TSV file. For example, '.fastq.gz' or '.fq.gz'.
#' @param suf_R A logical value indicating whether 'R' is used in the filename suffices. For instance, suf_R = TRUE when read files are ended with '_R1.fastq.gz' and '_R2.fastq.gz'.
#' @param sort_by_name A string with values "decreasing", "increasing (default), or NULL indicating whether the output data frame will be sorted by isolate names in a specific order. This argument overrides "sort_by_depth" if the former is not NULL.
#' @param sort_by_depth A string with values "decreasing", "increasing", or NULL (no sorting) indicating whether the output data frame will be sorted in a specific order for sequencing depths and isolate names.
#' @return A data frame with summary statistics including sequencing depths.
#' @author Yu Wan <wanyuac@@gmail.com>
#' @export
# (C) Copyright 2023 Yu Wan <wanyuac@gmail.com>
# Licensed under the Apache License, Version 2.0
# Release: 4 Jan 2023; last update: 30 May 2024.
summariseSeqkitPEReadStats <- function(tsv,
header = c("file", "format", "type", "num_seqs", "sum_len", "min_len",
"avg_len", "max_len", "Q1", "Q2", "Q3", "sum_gap", "N50",
"N50_num", "Q20_perc", "Q30_perc", "AvgQual", "GC"),
ref_len = 5e6,
ext = ".fastq.gz",
suf_R = FALSE,
sort_by_name = "increasing",
sort_by_depth = NULL) {
rs <- read.delim(file = tsv, stringsAsFactors = FALSE)
header[1] <- "file" # Fix the name of the first column for the ease of coding
names(rs) <- header
rs$file <- gsub(pattern = ext, replacement = "", x = basename(rs$file), fixed = TRUE) # Remove filename extensions (e.g., ".fastq.gz")
trim_len <- ifelse(suf_R, 3, 2) # "_R[1,2]", then set suf_R = TRUE; otherwise ("_[1,2]"), set suf_R = FALSE (default)
rs <- rs[order(rs$file, decreasing = FALSE), ] # Ensure _1 always goes before _2 for the same isolate
rs$file <- sapply(rs$file, function(x) substr(x, start = 1, stop = nchar(x) - trim_len), simplify = TRUE, USE.NAMES = FALSE) # Remove "_1", "_2" or "_R1", "_R2" from column "file"
isolates <- unique(rs$file) # Isolate names
n <- length(isolates) # Number of isolates
out <- do.call(rbind.data.frame, lapply(isolates, .perIsolateStats, rs, ref_len))
m <- sum(is.na(out$Sum_len))
if (m > 0) {
print(paste("Warning: read sets of", m, "out of", n, "isolates could not be summarised.", sep = " "))
} else {
print(paste("All read sets of", n, "isolates have been summarised.", sep = " "))
}
if (! is.null(sort_by_name)) {
out <- out[order(out$Isolate, decreasing = (sort_by_name == "decreasing")), ]
} else if (! is.null(sort_by_depth)) {
out <- out[order(out$Depth, out$Isolate, decreasing = (sort_by_depth == "decreasing")), ]
}
rownames(out) <- NULL
return(out)
}
.perIsolateStats <- function(i, rs, ref_len) {
r <- subset(rs, file == i) # Returns two rows for paired-end reads
if (nrow(r) == 2) {
total_bases <- sum(r$sum_len)
total_reads <- sum(r$num_seqs)
# Note on 9 Apr 2024: removed 'N50 = paste(r$N50, collapse = ";")' and 'Q3_len = paste(r$Q3, collapse = ";")' as they have become optional in new seqkit versions.
out <- data.frame(Isolate = i,
Read_num = total_reads,
Sum_len = total_bases,
Max_len = paste(r$max_len, collapse = ";"),
Q3_len = paste(r$Q3, collapse = ";"),
Avg_len = round(total_bases / total_reads, digits = 0),
Min_len = paste(r$min_len, collapse = ";"),
Depth = round(total_bases / ref_len, digits = 2),
Mean_GC = round((r$sum_len[1] * r$GC[1] + r$sum_len[2] * r$GC[2]) / sum(r$sum_len), digits = 2),
stringsAsFactors = FALSE)
} else {
out <- data.frame(Isolate = i,
Read_num = NA,
Sum_len = NA,
Max_len = NA,
Q3_len = NA,
Avg_len = NA,
Min_len = NA,
Depth = NA,
Mean_GC = NA,
stringsAsFactors = FALSE)
}
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.