Nothing
#' FASTQ PAIR
#'
#' @description
#' Rewrite paired end fastq files to make sure that all reads have a mate and to
#' separate out singletons.
#'
#' Usually when you get paired end read files you have two files with a /1
#' sequence in one and a /2 sequence in the other (or a /f and /r or just two
#' reads with the same ID). However, often when working with files from a third
#' party source (e.g. the SRA) there are different numbers of reads in each file
#' (because some reads fail QC). Spades, bowtie2 and other tools break because
#' they demand paired end files have the same number of reads.
#'
#' @param fq1,fq2 A string of fastq file path.
#' @param ... `r rd_dots("fastq_pair")`.
#' @param hash_table_size Size of hash table to use.
#' @param max_hash_table_size Maximal hash table size to use.
#' @param fastq_pair `r rd_cmd("fastq_pair")`.
#' @seealso
#' - <https://github.com/linsalrob/fastq-pair>
#'
#' `r rd_seealso()`
#' @inherit exec return
#' @family command
#' @export
fastq_pair <- make_command(
"fastq_pair",
function(
fq1,
fq2,
...,
hash_table_size = NULL,
max_hash_table_size = NULL,
fastq_pair = NULL
) {
assert_string(fastq_pair, allow_empty = FALSE, allow_null = TRUE)
FastqPair$new(
cmd = fastq_pair,
...,
fq1 = fq1,
fq2 = fq2,
hash_table_size = hash_table_size,
max_hash_table_size = max_hash_table_size
)
}
)
FastqPair <- R6Class(
"FastqPair",
inherit = Command,
private = list(
alias = function() "fastq_pair",
setup_help_params = function() "--help",
setup_command_params = function(
fq1,
fq2,
hash_table_size,
max_hash_table_size
) {
assert_string(fq1, allow_empty = FALSE)
assert_string(fq2, allow_empty = FALSE)
if (is.null(hash_table_size)) {
nlines_file <- tempfile()
if (private$verbose) {
cli::cli_inform(
"counting the number of lines of {.path {fq1}}"
)
}
on.exit(file.remove(nlines_file), add = TRUE)
cmd_run(
exec("wc", "-l", fq1),
stdout = nlines_file,
verbose = FALSE
)
hash_table_size <- strsplit(
read_lines(nlines_file, n = 1L),
" ",
fixed = TRUE
)[[c(1L, 1L)]]
hash_table_size <- ceiling(as.integer(hash_table_size) / 4L)
if (private$verbose) {
cli::cli_inform("Using -t {.val {hash_table_size}}")
}
}
if (!is.null(max_hash_table_size)) {
hash_table_size <- min(hash_table_size, max_hash_table_size)
}
c(
arg0("-t", hash_table_size, format = "%d"),
fq1,
fq2,
">",
nullfile()
)
}
)
)
#' @param fastq_files A character of the fastq file paths.
#' @rdname fastq_pair
#' @export
fastq_read_pair <- function(fastq_files) {
header_list <- lapply(fastq_files, function(file) {
header <- read_lines2(file, n = 1L)
strsplit(header, ":| ", perl = TRUE)[[1L]]
})
# @HWI-ST1276:71:C1162ACXX:1:1101:1208:2458 2:N:0:CGATGT
# HWI-ST1276 <- Unique instrument name
# 71 <- Run ID
# C1162ACXX <- Flowcell ID
# 1 <- Flowcell lane
# 1101 <- Tile number within the flowcell lane
# 1208 <- 'x'-coordinate of the cluster within the tile
# 2458 <- 'y'-coordinate of the cluster within the tile
# 2 <- Member of a pair,1 or 2 (paired-end or mate-pair reads only)
# N <- Y if the read fails filter (read is bad), N otherwise
# 0 <- 0 when none of the control bits are on,otherwise it is an even number
# CGATGT -> Index sequence
# In Illumina data, read group IDs are composed using the flowcell name and
# lane number, making them a globally unique identifier across all
# sequencing data in the world.
# Use for BQSR: ID is the lowest denominator that differentiates factors
# contributing to technical batch effects: therefore, a read group is
# effectively treated as a separate run of the instrument in data processing
# steps such as base quality score recalibration (unless you have PU
# defined), since they are assumed to share the same error model.
# https://gatk.broadinstitute.org/hc/en-us/articles/360035890671-Read-groups
# https://samtools.github.io/hts-specs/SAMv1.pdf
# https://angus.readthedocs.io/en/2017/Read_group_info.html
# header <- header_list[[1L]]
# id <- sub("^@", "", paste(header[3:4], collapse = "."), perl = TRUE)
# flowcell_id <- header[[3L]]
# lane_id <- header[[4L]]
# Platform/technology used to produce the reads. Valid values: CAPILLARY,
# DNBSEQ (MGI/BGI), ELEMENT, HELICOS, ILLUMINA, IONTORRENT, LS454, ONT
# (Oxford Nanopore), PACBIO (Pacific Bio-sciences), SOLID, and ULTIMA. This
# field should be omitted when the technology is not in this list (though
# the PM field may still be present in this case) or is unknown
# platform_id <- "ILLUMINA"
# extract pair_id ----------------------------
vapply(
header_list,
function(header) as.integer(header[[8L]]),
integer(1L),
USE.NAMES = FALSE
)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.