#' Merge groups of Fastq /Fasta files
#'
#' Will use multithreading to speed up process.
#' Only works for Unix OS (Linux and Mac)
#' @export
#' @param in_files \code{character} specify the full path to the individual fastq.gz files.
#' Seperated by space per file in group: For 2 output files from 4 input files:
#' in_files <- c("file1.fastq file2.fastq". "file3.fastq file4.fastq")
#' @param out_files \code{character} specify the path to the FASTQ directory
#' For 2 output files: out_files <- c("/merged/file1&2.fastq", "/merged/file3&4.fastq")
#' @inheritParams download.SRA
#' @return invisible(NULL).
#' @export
#' @examples
#' fastq.folder <- tempdir() # <- Your fastq files
#' infiles <- dir(fastq.folder, "*.fastq", full.names = TRUE)
#' \dontrun{
#' # Seperate files into groups (here it is 4 output files from 12 input files)
#' in_files <- c(paste0(grep(infiles, pattern = paste0("ribopool-",
#' seq(11, 14), collapse = "|"), value = TRUE), collapse = " "),
#' paste0(grep(infiles, pattern = paste0("ribopool-",
#' seq(18, 19), collapse = "|"), value = TRUE), collapse = " "),
#' paste0(grep(infiles, pattern = paste0("C11-",
#' seq(11, 14), collapse = "|"), value = TRUE), collapse = " "),
#' paste0(grep(infiles, pattern = paste0("C11-",
#' seq(18, 19), collapse = "|"), value = TRUE), collapse = " "))
#'
#' out_files <- paste0(c("SSU_ribopool", "LSU_ribopool", "SSU_WT", "LSU_WT"), ".fastq.gz")
#' merged.fastq.folder <- file.path(fastq.folder, "merged/")
#' out_files <- file.path(merged.fastq.folder, out_files)
#'
#' mergeFastq(in_files, out_files)
#' }
mergeFastq <- function(in_files, out_files, BPPARAM = bpparam()) {
# TODO: Make it work on windows
if (.Platform$OS.type != "unix") stop("Merge does not work on windows OS")
if (length(in_files) != length(out_files)) stop("Not equal length of in_files and out_files!")
bplapply(seq_along(in_files), function(x, in_files, out_files) {
dir.create(dirname(out_files[x]), showWarnings = FALSE, recursive = TRUE)
system(paste("cat", in_files[x], ">", out_files[x]))
}, in_files = in_files, out_files = out_files, BPPARAM = BPPARAM)
message("Merge Done")
return(invisible(NULL))
}
#' Very fast fastq/fasta collapser
#'
#' For each unique read in the file, collapse into 1 and state in the fasta header
#' how many reads existed of that type. This is done after trimming usually, works
#' best for reads < 50 read length. Not so effective for 150 bp length mRNA-seq etc.
#' @param files paths to fasta / fastq files to collapse. I tries to detect format per file,
#' if file does not have .fastq, .fastq.gz, .fq or fq.gz extensions, it will be treated
#' as a .fasta file format.
#' @param outdir outdir to save files, default:
#' \code{file.path(dirname(files[1]), "collapsed")}.
#' Inside same folder as input files, then create subfolder "collapsed", and add a prefix
#' of "collapsed_" to the output names in that folder.
#' @param header.out.format character, default "ribotoolkit", else must be "fastx".
#' How the read header of the output fasta should be formated: ribotoolkit: ">seq1_x55",
#' sequence 1 has 55 duplicated reads collapsed.
#' fastx: ">1-55", sequence 1 has 55 duplicated reads collapsed
#' @param prefix character, default "collapsed_"
#' Prefix to name of output file.
#' @param compress logical, default FALSE
#' @return invisible(NULL), files saved to disc in fasta format.
#' @export
#' @examples
#'
#' fastq.folder <- tempdir() # <- Your fastq files
#' infiles <- dir(fastq.folder, "*.fastq", full.names = TRUE)
#' # collapse.fastq(infiles)
#'
collapse.fastq <- function(files, outdir = file.path(dirname(files[1]), "collapsed"),
header.out.format = "ribotoolkit", compress = FALSE,
prefix = "collapsed_") {
if (!dir.exists(outdir)) {
dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
}
format <- rep("fasta", length(files))
format[grep("\\.fastq|\\.fq", files)] <- "fastq"
for (f in seq_along(files)) {
file <- files[f]
message("File ", f, "/", length(files), ": ", file)
fasta_name <- gsub(pattern = "\\.fastq$", replacement = ".fasta",
basename(file))
new_file_name <- paste0(prefix, fasta_name,
ifelse(compress, ".gz", ""))
writeXStringSet(collapse.fastq.internal(
readDNAStringSet(file, format = format[f], use.names = FALSE), header.out.format),
file.path(outdir, new_file_name),
compress = compress, format = "fasta")
}
return(invisible(NULL))
}
collapse.fastq.internal <- function(seqs, header.out.format = "ribotoolkit") {
# Fast collapser using data.table
replicates <- data.table(seqs = as.character(seqs))
# Much faster with 1 core actually, strange..
old_threads <- data.table::getDTthreads()
data.table::setDTthreads(1)
replicates <- replicates[, .N, by = seqs][order(N, decreasing = TRUE),]
data.table::setDTthreads(old_threads)
if (header.out.format == "fastx") {
headers <- paste0(seq.int(nrow(replicates)), "-", replicates$N)
} else if (header.out.format == "ribotoolkit") {
headers <- paste0("seq", seq.int(nrow(replicates)), "_x", replicates$N)
} else stop("format must be 'fastx' or 'ribotoolkit'")
new_seqs <- replicates$seqs
names(new_seqs) <- headers
return(DNAStringSet(new_seqs, use.names = TRUE))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.